This notebook explores group psychotherapy transcripts and survey data provided by Martin Kivlighan in the spring of 2023 and then again in the fall of 2023. It describes:
import os
import re
import glob
import torch
import statistics
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sb
import pandas as pd
from pdfminer.high_level import extract_text
from transformers import AutoTokenizer, pipeline
# Maximum sequence lengths: one for the length-distribution tokenizer, one for
# the sentiment model (inputs longer than these are truncated)
dist_max_seq_len = 32768
sent_max_seq_len = 512
# Hugging Face hub identifiers for the tokenizer/model used below
dist_tokenizer_path = "allenai/longformer-base-4096"
sent_model_path = "j-hartmann/emotion-english-roberta-large"
# Directory holding the session transcript PDFs (WDS2/WDS3 subdirectories)
pdf_dir = "./data/fall_2023/session_transcripts/"
# Path to the session survey spreadsheet
survey_path = "./data/fall_2023/WDS_Session_Survey.xlsx"
# Group leader names exactly as they appear as speaker labels in the transcripts
leaders_iter2 = [
"Leader 1 (she/her)",
"Leader 2 (she/her/hers)",
]
leaders_iter3 = [
"Leader 1",
"Leader 3"
]
# Emotion labels produced by the sentiment model (used as dict keys downstream)
emotions = ["neutral", "sadness", "surprise", "joy", "fear", "disgust", "anger"]
# Default figure size for all plots in this notebook
matplotlib.rcParams["figure.figsize"] = (9, 7)
# Custom default color cycle ("CB" presumably = colorblind-safe palette — confirm)
CB_color_cycle = [
"#377eb8",
"#ff7f00",
"#4daf4a",
"#f781bf",
"#a65628",
"#984ea3",
"#999999",
"#e41a1c",
"#dede00",
"#000000",
"#8DA0CB",
"#A6D854",
"#E5C494",
]
matplotlib.rcParams["axes.prop_cycle"] = matplotlib.cycler(color=CB_color_cycle)
df = pd.read_excel(survey_path)
For now, drop rows where the cohesion or Alliance rating is missing.
df = df[~df[['cohesion', 'Alliance']].isnull().any(axis=1)]
There were some other typos in participants' names in the Excel file that I fixed manually.
# Normalize participant codes to a consistent capitalization
df['wdscode'] = df['wdscode'].apply(lambda x: x.capitalize())
# Iteration-2 transcripts label speakers as "<pseudonym>_2" (used as the
# speakers list when parsing WDS2 PDFs below)
participant_text_names_iter2 = [x + '_2' for x in df[df['iteration'] == 2]['pseudonym'].unique()]
participant_text_names_iter2
# Unique participant codes for each study iteration
participants_iter2 = df[df['iteration'] == 2]['wdscode'].unique().tolist()
participants_iter2
participants_iter3 = df[df['iteration'] == 3]['wdscode'].unique().tolist()
participants_iter3
# Numeric ratings plus identifying columns, one row per participant per session
score_df = df[
["wdscode",
"pseudonym",
"id",
"iteration",
"session",
"cohesion",
"Alliance"
]
]
# Free-text survey responses
text_df = df[["session", "id", "majortakeaway", "topicfdbk", "fdbkforleaders"]]
# Split scores by iteration and sort by session (the plotting helper asserts
# session order)
iter2_score_df = score_df[score_df['iteration'] == 2].sort_values("session")
iter3_score_df = score_df[score_df['iteration'] == 3].sort_values("session")
iter2_score_df.shape
iter3_score_df.shape
def plot_scores_per_participant_per_session(score_df, participants, score_cols, identifier):
    """Plot one figure per score column: score vs. session, one line per participant.

    Args:
        score_df: Ratings dataframe, pre-sorted by the "session" column.
        participants: Participant values to plot, matched against `identifier`.
        score_cols: Names of the score columns to plot (one figure each).
        identifier: Column used to select a participant's rows (e.g. 'wdscode').
    """
    # Guard: the dataframe must already be ordered by session number
    session_order = score_df["session"].tolist()
    assert_msg = "Expected score dataframe to be sorted by session."
    assert all(a <= b for a, b in zip(session_order, session_order[1:])), assert_msg
    for score in score_cols:
        fig, ax = plt.subplots()
        for participant in participants:
            rows = score_df[score_df[identifier] == participant]
            ax.plot(rows["session"].tolist(), rows[score].tolist(), label=participant)
        ax.set_title(f"Score - {score} - per Session per Participant")
        ax.set_xlabel("Session Number")
        ax.set_ylabel(f"Score - {score}")
        ax.legend(fontsize=6, loc="best")
        plt.show()
# Plot cohesion and Alliance trajectories for each iteration
plot_scores_per_participant_per_session(
score_df=iter2_score_df,
participants=participants_iter2,
score_cols=["cohesion", "Alliance"],
identifier='wdscode'
)
plot_scores_per_participant_per_session(
score_df=iter3_score_df,
participants=participants_iter3,
score_cols=["cohesion", "Alliance"],
identifier='wdscode'
)
# Tokenizers for the length-distribution analysis and for the sentiment model
dist_tokenizer = AutoTokenizer.from_pretrained(dist_tokenizer_path)
sent_tokenizer = AutoTokenizer.from_pretrained(sent_model_path)
# Emotion-classification pipeline. top_k=None already returns scores for every
# label; the deprecated return_all_scores=True flag was redundant with it (and
# triggers a deprecation warning in recent transformers), so it is dropped.
sent_pipeline = pipeline(
    "text-classification",
    model=sent_model_path,
    truncation=True,
    max_length=sent_max_seq_len,
    top_k=None,
)
# Collect the transcript PDFs for each iteration
iter2_pdf_files = glob.glob(os.path.join(os.path.join(pdf_dir, 'WDS2'), "*.pdf"))
iter3_pdf_files = glob.glob(os.path.join(os.path.join(pdf_dir, 'WDS3'), "*.pdf"))
def count_words_in_pdfs(pdf_files):
    """Print per-document word/subword counts, summary statistics, and histograms.

    For each PDF, extracts plain text, counts whitespace-delimited words, and
    counts subword tokens using the module-level `dist_tokenizer`. Tokenization
    is truncated at `dist_max_seq_len`, so subword counts for very long
    documents are capped at that value.

    Args:
        pdf_files: Paths of the PDF transcripts to analyze.

    Raises:
        statistics.StatisticsError: If `pdf_files` is empty (no mean defined).
    """
    num_words_list = []
    num_subwords_list = []
    for pdf_file in pdf_files:
        text = extract_text(pdf_file)
        num_words = len(text.split())
        num_words_list.append(num_words)
        tokenized_text = dist_tokenizer(
            text, padding=False, truncation=True, max_length=dist_max_seq_len
        )
        num_subwords = len(tokenized_text["input_ids"])
        num_subwords_list.append(num_subwords)
        print(
            f"{pdf_file} - Number of words: {num_words} | Number of subwords: {num_subwords}"
        )
    # Descriptive statistics. stdev needs at least two samples, so fall back to
    # 0.0 for a single document instead of raising StatisticsError.
    mean_num_words = statistics.mean(num_words_list)
    std_num_words = statistics.stdev(num_words_list) if len(num_words_list) > 1 else 0.0
    mean_num_subwords = statistics.mean(num_subwords_list)
    std_num_subwords = statistics.stdev(num_subwords_list) if len(num_subwords_list) > 1 else 0.0
    print(f"\nNumber of documents: {len(num_words_list)}")
    print(f"Mean number of words per document: {mean_num_words:.2f}")
    print(f"Standard deviation of number of words per document: {std_num_words:.2f}")
    print(f"Mean number of subwords per document: {mean_num_subwords:.2f}")
    print(f"Standard deviation of number of subwords per document: {std_num_subwords:.2f}")
    # Histogram of per-document word counts
    plt.hist(num_words_list, bins=10)
    plt.xlabel("Number of Words")
    plt.ylabel("Frequency")
    plt.title("Number of Words Distribution")
    plt.show()
    # Histogram of per-document subword counts
    plt.hist(num_subwords_list, bins=10)
    plt.xlabel("Number of Subwords")
    plt.ylabel("Frequency")
    plt.title("Number of Subwords Distribution")
    plt.show()
# Report length distributions for each iteration's transcripts
count_words_in_pdfs(iter2_pdf_files)
count_words_in_pdfs(iter3_pdf_files)
def should_remove(line):
    """Return True if a transcript line carries no speech content.

    Filters out blank lines, page numbers (digit-only lines), bracketed
    metadata of the form "{...}", and the recurring "WDS Iteration" header.

    Args:
        line: A single line of transcript text.

    Returns:
        bool: True when the line should be skipped. (The original returned
        None for kept lines; now an explicit False is returned.)
    """
    line = line.strip()
    return (
        not line
        or line.isdigit()
        or (line.startswith("{") and line.endswith("}"))
        or "WDS Iteration" in line
    )
def count_words_per_speaker(pdf_files, speakers):
    """Parse transcript PDFs and aggregate per-speaker word/utterance statistics.

    A speaker change is detected when a line starts with "<Name>:". Lines that
    `should_remove` flags (page numbers, "{...}" metadata, headers) are skipped.
    Also prints summary statistics and shows a histogram of per-file word counts.

    Args:
        pdf_files: Paths of the transcript PDFs, one file per session.
        speakers: Speaker names exactly as they appear before ":" in the text.

    Returns:
        Tuple of:
        - speaker_word_counts: {speaker: total words across all files}
        - speaker_mentions_of: {speaker: {other speaker: mention count}}
        - speaker_utterances: {speaker: {file index: [utterance strings]}}

    Bug fixes vs. the original:
    - The final speaker's utterance in each file is now flushed (it was
      silently dropped before).
    - Lines within an utterance are joined with a space, so the last word of
      one line no longer fuses with the first word of the next.
    """
    word_counts = {}          # per-file total words across all speakers
    speaker_word_counts = {}  # per-speaker totals across all files
    speaker_mentions_of = {
        speaker: {spoken_to: 0 for spoken_to in speakers} for speaker in speakers
    }
    speaker_utterances = {
        speaker: {num: [] for num in range(len(pdf_files))} for speaker in speakers
    }
    for i, file in enumerate(pdf_files):
        # Per-file state
        speaker_counts = {}
        current_speaker = None
        utterance = ""
        text = extract_text(file)
        for line in text.split("\n"):
            if not line:
                continue
            # Detect a speaker change via a "<Name>:" prefix
            for speaker in speakers:
                if line.startswith(speaker + ":"):
                    if current_speaker == speaker:
                        # Same speaker continuing — nothing to flush
                        break
                    # New speaker: save the previous speaker's full utterance
                    # (on the very first speaker there is nothing to save)
                    if current_speaker is None:
                        assert utterance == "", "Expected utterance to be empty."
                    else:
                        speaker_utterances[current_speaker][i].append(utterance)
                    current_speaker = speaker
                    utterance = ""
            if current_speaker:
                # Skip page numbers, bracketed info, headers, and blank content
                if should_remove(line):
                    continue
                # Strip the speaker-name prefix before counting words
                clean_line = line.replace(current_speaker + ":", "")
                # Count mentions of other speakers; a plain substring test also
                # covers the bracketed "[Name]" form the original checked first
                for speaker in speakers:
                    if speaker in clean_line:
                        speaker_mentions_of[current_speaker][speaker] += 1
                words = clean_line.split()
                speaker_counts[current_speaker] = (
                    speaker_counts.get(current_speaker, 0) + len(words)
                )
                # Append to the utterance, separating lines by a single space
                if words:
                    if utterance:
                        utterance += " "
                    utterance += " ".join(words)
        # Flush the last speaker's utterance at end of file (bug fix)
        if current_speaker is not None:
            speaker_utterances[current_speaker][i].append(utterance)
        # Total words spoken in this file across all speakers
        word_counts[file] = sum(speaker_counts.values())
        # Accumulate per-speaker totals across files
        for speaker, count in speaker_counts.items():
            speaker_word_counts[speaker] = speaker_word_counts.get(speaker, 0) + count
    mean_num_words = statistics.mean(word_counts.values())
    # stdev needs >= 2 samples; report 0.0 for a single file instead of raising
    std_num_words = statistics.stdev(word_counts.values()) if len(word_counts) > 1 else 0.0
    print(f"\nNumber of documents: {len(word_counts)}")
    print(f"Mean number of words per document: {mean_num_words:.2f}")
    print(f"Standard deviation of number of words per document: {std_num_words:.2f}")
    # Note: text not attributed to a speaker is excluded, so these counts
    # differ slightly from count_words_in_pdfs
    plt.hist(word_counts.values(), bins=10)
    plt.xlabel("Number of Words")
    plt.ylabel("Frequency")
    plt.title("Number of Words Distribution")
    plt.show()
    return speaker_word_counts, speaker_mentions_of, speaker_utterances
# Parse both iterations' transcripts; leaders are included as speakers
speaker_word_counts_iter2, speaker_mentions_of_iter2, speaker_utterances_iter2 = count_words_per_speaker(
pdf_files=iter2_pdf_files,
speakers=participant_text_names_iter2 + leaders_iter2
)
speaker_word_counts_iter3, speaker_mentions_of_iter3, speaker_utterances_iter3 = count_words_per_speaker(
pdf_files=iter3_pdf_files,
speakers=participants_iter3 + leaders_iter3
)
def add_leader_tag(d, leaders):
    """Return a copy of *d* whose leader keys are suffixed with " - Leader".

    Args:
        d: Mapping keyed by speaker name.
        leaders: Names to tag as group leaders.

    Returns:
        A new dict with leader keys renamed; non-leader entries are unchanged.
    """
    leader_set = set(leaders)
    return {
        (key + " - Leader" if key in leader_set else key): value
        for key, value in d.items()
    }
def plot_total_words_per_speaker(speaker_word_counts, leaders):
    """Horizontal bar chart of total words spoken by each speaker across sessions.

    Args:
        speaker_word_counts: {speaker: total word count}.
        leaders: Speaker names to tag with " - Leader" in the labels.
    """
    speaker_word_counts = add_leader_tag(speaker_word_counts, leaders)
    # Sort ascending so the largest bar ends up at the top of the chart
    speaker_word_counts = dict(sorted(speaker_word_counts.items(), key=lambda item: item[1]))
    plt.barh(list(speaker_word_counts.keys()), speaker_word_counts.values())
    plt.title("Words Spoken by Each Speaker Across Sessions")
    # Bug fix: barh puts counts on the x-axis and speaker names on the y-axis,
    # so the original axis labels were swapped.
    plt.xlabel("Number of Words")
    plt.ylabel("Speaker")
    plt.show()
def plot_speaker_mentions_of_other_speakers(speaker_mentions_of, leaders):
    """Heatmap of how often each speaker mentions each other speaker by name.

    Args:
        speaker_mentions_of: {speaker: {mentioned speaker: count}}.
        leaders: Speaker names to tag with " - Leader" in the row labels.
    """
    tagged = add_leader_tag(
        {name + " - Speaker": counts for name, counts in speaker_mentions_of.items()},
        [leader + " - Speaker" for leader in leaders],
    )
    # Rows = speakers, columns = who they mentioned; missing pairs become 0
    mention_matrix = pd.DataFrame(tagged).T.fillna(0)
    sb.heatmap(mention_matrix)
    plt.xlabel("Mentioned by Name")
    plt.ylabel("Speaker")
    plt.title("Count of Speaker Mentions by Name")
    plt.show()
def plot_num_utterances_per_speaker_per_session(speaker_utterances, leaders):
    """Line plot of utterance counts per session, one line per speaker.

    Args:
        speaker_utterances: {speaker: {session index: [utterances]}}.
        leaders: Speaker names to tag with " - Leader" in the legend.
    """
    fig, ax = plt.subplots()
    for speaker, per_session in speaker_utterances.items():
        sessions = list(per_session.keys())
        counts = [len(per_session[s]) for s in sessions]
        label = speaker + " - Leader" if speaker in leaders else speaker
        ax.plot(sessions, counts, label=label)
    ax.set_title("Number of Utterances per Session per Speaker")
    ax.set_xlabel("Session Number")
    ax.set_ylabel("Number of Utterances")
    ax.legend(fontsize=6, loc="best")
    plt.show()
def plot_num_words_per_speaker_per_session(speaker_utterances, leaders):
    """Line plot of per-session word counts, one line per speaker.

    Word counts are derived by splitting each stored utterance on whitespace.

    Args:
        speaker_utterances: {speaker: {session index: [utterances]}}.
        leaders: Speaker names to tag with " - Leader" in the legend.
    """
    fig, ax = plt.subplots()
    for speaker, per_session in speaker_utterances.items():
        sessions = list(per_session.keys())
        totals = [
            sum(len(utterance.split()) for utterance in per_session[s])
            for s in sessions
        ]
        label = speaker + " - Leader" if speaker in leaders else speaker
        ax.plot(sessions, totals, label=label)
    ax.set_title("Number of Words per Session per Speaker")
    ax.set_xlabel("Session Number")
    ax.set_ylabel("Number of Words")
    ax.legend(fontsize=6, loc="best")
    plt.show()
# Speaker-level plots for iteration 2
plot_total_words_per_speaker(speaker_word_counts_iter2, leaders_iter2)
plot_speaker_mentions_of_other_speakers(speaker_mentions_of_iter2, leaders_iter2)
plot_num_utterances_per_speaker_per_session(speaker_utterances_iter2, leaders_iter2)
plot_num_words_per_speaker_per_session(speaker_utterances_iter2, leaders_iter2)
# Speaker-level plots for iteration 3
plot_total_words_per_speaker(speaker_word_counts_iter3, leaders_iter3)
plot_speaker_mentions_of_other_speakers(speaker_mentions_of_iter3, leaders_iter3)
plot_num_utterances_per_speaker_per_session(speaker_utterances_iter3, leaders_iter3)
plot_num_words_per_speaker_per_session(speaker_utterances_iter3, leaders_iter3)
def compute_emotion_scores(speaker_utterances, emotions):
    """Compute word-count-weighted emotion scores per session for each speaker.

    Each utterance's per-label pipeline score is weighted by the utterance's
    length in words; a session's score for an emotion is the sum of those
    weighted scores divided by the session's total word count (0 if the
    speaker said nothing that session).

    Uses the module-level `sent_pipeline`.

    Args:
        speaker_utterances: {speaker: {session index: [utterance strings]}}.
        emotions: Emotion labels; must cover every label the pipeline emits.

    Returns:
        (xs, ys) where, in speaker order, xs[j] is the list of session indices
        and ys[j] is {emotion: [normalized score per session]}.
    """
    xs = []
    ys = []
    for speaker in speaker_utterances:
        session_ids = list(speaker_utterances[speaker].keys())
        per_emotion = {emotion: [] for emotion in emotions}
        for session_id in session_ids:
            utterances = speaker_utterances[speaker][session_id]
            word_count = 0
            emotion_scores = {emotion: [] for emotion in emotions}
            # Run inference once over the whole session's utterances — the
            # original called the pipeline per utterance despite its comment
            # claiming batched inference. Guard the empty-session case.
            outputs = sent_pipeline(utterances) if utterances else []
            for utterance, output in zip(utterances, outputs):
                utterance_length = len(utterance.split())
                word_count += utterance_length
                # Weight each label's score by the utterance length
                for d in output:
                    emotion_scores[d["label"]].append(d["score"] * utterance_length)
            for emotion in emotions:
                scores = emotion_scores[emotion]
                # Normalize by the session's total word count
                scaled_score = sum(scores) / word_count if word_count else 0
                per_emotion[emotion].append(scaled_score)
        xs.append(session_ids)
        ys.append(per_emotion)
    return xs, ys
# Score every speaker's sessions for all emotions (slow: runs model inference)
iter2_xs, iter2_ys = compute_emotion_scores(speaker_utterances=speaker_utterances_iter2, emotions=emotions)
iter3_xs, iter3_ys = compute_emotion_scores(speaker_utterances=speaker_utterances_iter3, emotions=emotions)
def plot_emotion_trend(emotion, speaker_utterances, xs, ys, leaders):
    """Line plot of one emotion's normalized score per session for every speaker.

    Args:
        emotion: The emotion label to plot.
        speaker_utterances: Speaker dict (iterated for names, in the same
            order used to build xs/ys).
        xs: Per-speaker lists of session indices.
        ys: Per-speaker {emotion: [score per session]} dicts.
        leaders: Speaker names to tag with " - Leader" in the legend.
    """
    fig, ax = plt.subplots()
    for speaker, sessions, scores in zip(speaker_utterances, xs, ys):
        label = speaker + " - Leader" if speaker in leaders else speaker
        ax.plot(sessions, scores[emotion], label=label)
    ax.set_title(f"Score for Emotion: {emotion}")
    ax.set_xlabel("Session Number")
    ax.set_ylabel("Session Emotion Score Normalized by Utterance Lengths")
    ax.legend(fontsize=6, loc="best")
    plt.show()
# One trend plot per emotion, for each iteration
for emotion in emotions:
    plot_emotion_trend(
        emotion,
        speaker_utterances=speaker_utterances_iter2,
        xs=iter2_xs,
        ys=iter2_ys,
        leaders=leaders_iter2
    )
for emotion in emotions:
    plot_emotion_trend(
        emotion,
        speaker_utterances=speaker_utterances_iter3,
        xs=iter3_xs,
        ys=iter3_ys,
        leaders=leaders_iter3
    )